Importing¶

In [ ]:
import xarray as xr
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn import preprocessing

from sklearn.neural_network import MLPRegressor
from sklearn.ensemble import BaggingRegressor

from sklearn.metrics import root_mean_squared_error as rmse

from tqdm.auto import tqdm

import random

import salishsea_tools.viz_tools as sa_vi

Datasets Preparation¶

In [ ]:
def datasets_preparation(dataset, dataset2):
    
    drivers = np.stack([np.ravel(dataset['Temperature_(0m-15m)']),
        np.ravel(dataset['Temperature_(15m-100m)']), np.ravel(dataset['Salinity_(0m-15m)']),
        np.ravel(dataset['Salinity_(15m-100m)']), np.ravel(dataset2.Clusters_Drivers)])
    
    indx = np.where(~np.isnan(drivers).any(axis=0))
    drivers = drivers[:,indx[0]]

    diat = np.ravel(dataset['Diatom'])
    diat = diat[indx[0]]

    return(drivers, diat, indx)

Regressor¶

In [ ]:
def regressor (inputs, targets):
    
    inputs = inputs.transpose()
    
    # Regressor
    scale = preprocessing.StandardScaler()
    inputs = scale.fit_transform(inputs)
    X_train, _, y_train, _ = train_test_split(inputs, targets, train_size=0.35)

    drivers = None
    diat = None
    
    inputs = None
    targets = None

    model = MLPRegressor(hidden_layer_sizes=200)
    regr = BaggingRegressor(model, n_estimators=12, n_jobs=4).fit(X_train, y_train) 

    return (regr)

Regressor 2¶

In [ ]:
def regressor2 (inputs, targets, variable_name):
    
    inputs = inputs.transpose()
    
    # Regressor
    scale = preprocessing.StandardScaler()
    inputs2 = scale.fit_transform(inputs)

    outputs_test = regr.predict(inputs2)
   
    m = scatter_plot(targets, outputs_test, variable_name) 
    r = np.round(np.corrcoef(targets, outputs_test)[0][1],3)
    rms = rmse(targets, outputs_test)

    return (r, rms, m)

Regressor 3¶

In [ ]:
def regressor3 (inputs, targets):
    
    inputs = inputs.transpose()
    
    # Regressor
    scale = preprocessing.StandardScaler()
    inputs2 = scale.fit_transform(inputs)

    outputs_test = regr.predict(inputs2)
   
    # compute slope m and intercept b
    m, b = np.polyfit(targets, outputs_test, deg=1)
    
    r = np.round(np.corrcoef(targets, outputs_test)[0][1],3)
    rms = rmse(targets, outputs_test)

    return (r, rms, m)

Regressor 4¶

In [ ]:
def regressor4 (inputs, targets, variable_name):
    
    inputs = inputs.transpose()
    
    # Regressor
    scale = preprocessing.StandardScaler()
    inputs2 = scale.fit_transform(inputs)

    outputs = regr.predict(inputs2)

    # Post processing
    indx2 = np.full((len(diat_i.y)*len(diat_i.x)),np.nan)
    indx2[indx[0]] = outputs
    model = np.reshape(indx2,(len(diat_i.y),len(diat_i.x)))

    m = scatter_plot(targets, outputs, variable_name + str(dates[i].date())) 

    # Preparation of the dataarray 
    model = xr.DataArray(model,
        coords = {'y': diat_i.y, 'x': diat_i.x},
        dims = ['y','x'],
        attrs=dict( long_name = variable_name + "Concentration",
        units="mmol m-2"),)
                        
    plotting3(targets, model, diat_i, variable_name)

Printing¶

In [ ]:
def printing (targets, outputs, m):

    print ('The amount of data points is', outputs.size)
    print ('The slope of the best fitting line is ', np.round(m,3))
    print ('The correlation coefficient is:', np.round(np.corrcoef(targets, outputs)[0][1],3))
    print (' The root mean square error is:', rmse(targets,outputs))

Scatter Plot¶

In [ ]:
def scatter_plot(targets, outputs, variable_name):

    # compute slope m and intercept b
    m, b = np.polyfit(targets, outputs, deg=1)

    printing(targets, outputs, m)

    fig, ax = plt.subplots(2, figsize=(5,10), layout='constrained')

    ax[0].scatter(targets,outputs, alpha = 0.2, s = 10)

    lims = [np.min([ax[0].get_xlim(), ax[0].get_ylim()]),
        np.max([ax[0].get_xlim(), ax[0].get_ylim()])]

    # plot fitted y = m*x + b
    ax[0].axline(xy1=(0, b), slope=m, color='r')

    ax[0].set_xlabel('targets')
    ax[0].set_ylabel('outputs')
    ax[0].set_xlim(lims)
    ax[0].set_ylim(lims)
    ax[0].set_aspect('equal')

    ax[0].plot(lims, lims,linestyle = '--',color = 'k')

    h = ax[1].hist2d(targets,outputs, bins=100, cmap='jet', 
        range=[lims,lims], cmin=0.1, norm='log')
    
    ax[1].plot(lims, lims,linestyle = '--',color = 'k')

    # plot fitted y = m*x + b
    ax[1].axline(xy1=(0, b), slope=m, color='r')

    ax[1].set_xlabel('targets')
    ax[1].set_ylabel('outputs')
    ax[1].set_aspect('equal')

    fig.colorbar(h[3],ax=ax[1], location='bottom')

    fig.suptitle(variable_name)

    plt.show()

    return (m)

Plotting¶

In [ ]:
def plotting(variable, name):

    plt.plot(years,variable, marker = '.', linestyle = '')
    plt.xlabel('Years')
    plt.ylabel(name)
    plt.show()

Plotting 2¶

In [ ]:
def plotting2(variable,title):
    
    fig, ax = plt.subplots()

    scatter= ax.scatter(dates,variable, marker='.', c=pd.DatetimeIndex(dates).month)

    ax.legend(handles=scatter.legend_elements()[0], labels=['February','March','April'])
    fig.suptitle('Daily ' + title + ' (15 Feb - 30 Apr)')
    
    fig.show()

Plotting 3¶

In [ ]:
def plotting3(targets, model, variable, variable_name):

    fig, ax = plt.subplots(2,2, figsize = (10,15))

    cmap = plt.get_cmap('cubehelix')
    cmap.set_bad('gray')

    variable.plot(ax=ax[0,0], cmap=cmap, vmin = targets.min(), vmax =targets.max(), cbar_kwargs={'label': variable_name + ' Concentration  [mmol m-2]'})
    model.plot(ax=ax[0,1], cmap=cmap, vmin = targets.min(), vmax = targets.max(), cbar_kwargs={'label': variable_name + ' Concentration  [mmol m-2]'})
    ((variable-model) / variable * 100).plot(ax=ax[1,0], cmap=cmap, cbar_kwargs={'label': variable_name + ' Concentration  [percentage]'})

    plt.subplots_adjust(left=0.1,
        bottom=0.1, 
        right=0.95, 
        top=0.95, 
        wspace=0.35, 
        hspace=0.35)

    sa_vi.set_aspect(ax[0,0])
    sa_vi.set_aspect(ax[0,1])
    sa_vi.set_aspect(ax[1,0])


    ax[0,0].title.set_text(variable_name + ' (targets)')
    ax[0,1].title.set_text(variable_name + ' (outputs)')
    ax[1,0].title.set_text('targets - outputs')
    ax[1,1].axis('off')

    fig.suptitle(str(dates[i].date()))

    plt.show()
    

Training (Random Points)¶

In [ ]:
ds = xr.open_dataset('/data/ibougoudis/MOAD/files/integrated_model_var_old.nc')

ds = ds.isel(time_counter = (np.arange(0, len(ds.Diatom.time_counter),2)), 
    y=(np.arange(ds.y[0], ds.y[-1], 5)), 
    x=(np.arange(ds.x[0], ds.x[-1], 5)))

ds_clusters = xr.open_dataset('/data/ibougoudis/MOAD/files/clustering.nc')

ds_clusters = ds_clusters.isel(time_counter = 
    (np.arange(0, len(ds_clusters.time_counter),2)), 
    y=(np.arange(ds_clusters.y[0], ds_clusters.y[-1], 5)), 
    x=(np.arange(ds_clusters.x[0], ds_clusters.x[-1], 5)))

dates = pd.DatetimeIndex(ds['time_counter'].values)

drivers, diat, _ = datasets_preparation(ds, ds_clusters)

regr = regressor(drivers, diat)

Other Years (Anually)¶

In [ ]:
years = range (2007,2024)

r_all = []
rms_all = []
slope_all = []

for year in tqdm(range (2007,2024)):
    
    dataset = ds.sel(time_counter=str(year))
    dataset2 = ds_clusters.sel(time_counter=str(year))
    
    drivers, diat, _ = datasets_preparation(dataset, dataset2)

    r, rms, m = regressor2(drivers, diat, 'Diatom ' + str(year))
    
    r_all.append(r)
    rms_all.append(rms)
    slope_all.append(m)
    
plotting(np.transpose(r_all), 'Correlation Coefficient')
plotting(np.transpose(rms_all), 'Root Mean Square Error')
plotting (np.transpose(slope_all), 'Slope of the best fitting line')
  0%|          | 0/17 [00:00<?, ?it/s]
The amount of data points is 70794
The slope of the best fitting line is  0.493
The correlation coefficient is: 0.677
 The root mean square error is: 0.11806228105889482
No description has been provided for this image
The amount of data points is 70794
The slope of the best fitting line is  0.494
The correlation coefficient is: 0.65
 The root mean square error is: 0.11196097221143426
No description has been provided for this image
The amount of data points is 68931
The slope of the best fitting line is  0.338
The correlation coefficient is: 0.596
 The root mean square error is: 0.15902004223463445
No description has been provided for this image
The amount of data points is 70794
The slope of the best fitting line is  0.403
The correlation coefficient is: 0.573
 The root mean square error is: 0.12065770189285961
No description has been provided for this image
The amount of data points is 68931
The slope of the best fitting line is  0.445
The correlation coefficient is: 0.605
 The root mean square error is: 0.13252828642623196
No description has been provided for this image
The amount of data points is 70794
The slope of the best fitting line is  0.451
The correlation coefficient is: 0.684
 The root mean square error is: 0.1161981562078246
No description has been provided for this image
The amount of data points is 70794
The slope of the best fitting line is  0.329
The correlation coefficient is: 0.599
 The root mean square error is: 0.14456476945444421
No description has been provided for this image
The amount of data points is 68931
The slope of the best fitting line is  0.412
The correlation coefficient is: 0.587
 The root mean square error is: 0.11877706548226002
No description has been provided for this image
The amount of data points is 70794
The slope of the best fitting line is  0.194
The correlation coefficient is: 0.266
 The root mean square error is: 0.16380165299139948
No description has been provided for this image
The amount of data points is 70794
The slope of the best fitting line is  0.313
The correlation coefficient is: 0.449
 The root mean square error is: 0.15758694866370293
No description has been provided for this image
The amount of data points is 68931
The slope of the best fitting line is  0.497
The correlation coefficient is: 0.632
 The root mean square error is: 0.10768095995628144
No description has been provided for this image
The amount of data points is 70794
The slope of the best fitting line is  0.304
The correlation coefficient is: 0.462
 The root mean square error is: 0.14348345412355115
No description has been provided for this image
The amount of data points is 68931
The slope of the best fitting line is  0.29
The correlation coefficient is: 0.477
 The root mean square error is: 0.1546765574767073
No description has been provided for this image
The amount of data points is 70794
The slope of the best fitting line is  0.266
The correlation coefficient is: 0.529
 The root mean square error is: 0.17496883042179392
No description has been provided for this image
The amount of data points is 70794
The slope of the best fitting line is  0.406
The correlation coefficient is: 0.648
 The root mean square error is: 0.1322111116293417
No description has been provided for this image
The amount of data points is 68931
The slope of the best fitting line is  0.447
The correlation coefficient is: 0.632
 The root mean square error is: 0.11373089859857413
No description has been provided for this image
The amount of data points is 70794
The slope of the best fitting line is  0.275
The correlation coefficient is: 0.435
 The root mean square error is: 0.15992062960794656
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image

Other Years (Daily)¶

In [ ]:
r_all2 = np.array([])
rms_all2 = np.array([])
slope_all2 = np.array([])

for i in tqdm(range (0, len(ds.time_counter))):
    
    dataset = ds.isel(time_counter=i)
    dataset2 = ds_clusters.isel(time_counter=i)

    drivers, diat, _ = datasets_preparation(dataset, dataset2)

    r, rms, m = regressor3(drivers, diat)

    r_all2 = np.append(r_all2,r)
    rms_all2 = np.append(rms_all2,rms)
    slope_all2 = np.append(slope_all2,m)

plotting2(r_all2, 'Correlation Coefficients')
plotting2(rms_all2, 'Root Mean Square Errors')
plotting2(slope_all2, 'Slope of the best fitting line')
  0%|          | 0/640 [00:00<?, ?it/s]
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image

Daily Maps¶

In [ ]:
maps = random.sample(range(0,len(ds.time_counter)),10)

for i in tqdm(maps):

    dataset = ds.isel(time_counter=i)
    dataset2 = ds_clusters.isel(time_counter=i)
    drivers, diat, indx = datasets_preparation(dataset, dataset2)

    diat_i = dataset['Diatom']

    regressor4(drivers, diat, 'Diatom ')
  0%|          | 0/10 [00:00<?, ?it/s]
The amount of data points is 1863
The slope of the best fitting line is  0.495
The correlation coefficient is: 0.275
 The root mean square error is: 0.11501985097299285
No description has been provided for this image
No description has been provided for this image
The amount of data points is 1863
The slope of the best fitting line is  0.311
The correlation coefficient is: 0.34
 The root mean square error is: 0.17710373497790888
No description has been provided for this image
No description has been provided for this image
The amount of data points is 1863
The slope of the best fitting line is  0.078
The correlation coefficient is: 0.065
 The root mean square error is: 0.1651837115578955
No description has been provided for this image
No description has been provided for this image
The amount of data points is 1863
The slope of the best fitting line is  0.274
The correlation coefficient is: 0.199
 The root mean square error is: 0.12493599714398045
No description has been provided for this image
No description has been provided for this image
The amount of data points is 1863
The slope of the best fitting line is  1.053
The correlation coefficient is: 0.663
 The root mean square error is: 0.12740912832697868
No description has been provided for this image
No description has been provided for this image
The amount of data points is 1863
The slope of the best fitting line is  0.481
The correlation coefficient is: 0.179
 The root mean square error is: 0.19128318013767068
No description has been provided for this image
No description has been provided for this image
The amount of data points is 1863
The slope of the best fitting line is  1.728
The correlation coefficient is: 0.751
 The root mean square error is: 0.22062290928885717
No description has been provided for this image
No description has been provided for this image
The amount of data points is 1863
The slope of the best fitting line is  0.215
The correlation coefficient is: 0.239
 The root mean square error is: 0.2729246525620645
No description has been provided for this image
No description has been provided for this image
The amount of data points is 1863
The slope of the best fitting line is  1.274
The correlation coefficient is: 0.525
 The root mean square error is: 0.20411716960570403
No description has been provided for this image
No description has been provided for this image
The amount of data points is 1863
The slope of the best fitting line is  0.226
The correlation coefficient is: 0.384
 The root mean square error is: 0.30664120949041757
No description has been provided for this image
No description has been provided for this image
In [ ]: